package net.big_oh.algorithms.fingerprint.winnowing;

import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/*
Copyright (c) 2009 Dave Wingate dba Big-Oh Software (www.big-oh.net)

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/

/**
 * <p>
 * A Java implementation of winnowing, a document fingerprinting algorithm.
 * Applying the winnowing algorithm to a source document results in the
 * generation of a set of fingerprint values, that can be used to identify
 * partial text duplications in other source documents (that have also been
 * fingerprinted).
 * </p>
 * <p>
 * Following are important arguments that you can vary to change the manner in
 * which your source documents are fingerprinted:
 * </p>
 * <ul>
 * <li>
 * k - The size of the k-grams generated by the fingerprinting process. Text
 * duplications shorter than size k will not be discovered. Note that in order
 * to compare the fingerprints for two documents (to detect duplication), both
 * documents must have been ingerprinted using the same value for k.</li>
 * <li>
 * t - The guarantee threshold. Text duplications of length t or greater are
 * guaranteed to be discoverable.</li>
 * </ul>
 * <p>
 * For guidance on setting k & t values, see
 * http://theory.stanford.edu/~aiken/publications/papers/sigmod03.pdf.
 * </p>
 * <p>
 * Before fingerprinting documents with the winnowing algorithm it is common to
 * pre-process those documents to remove whitespace and transform structural
 * elements to canonical form. For example, before fingerprinting a SQL
 * statement, one might remove comment lines (because they are "whitespace") as
 * well as rewrite join clauses in alphabetical order. This implementation of
 * winnowing provides the {@link WinnowingWhitespaceFilter} and
 * {@link WinnowingTextTransformer} classes to support whitespace removal and
 * source document transformation respectively.
 * </p>
 * 
 * @author davewingate
 * 
 */
public class WinnowingFingerprinter
{

	private final List<WinnowingWhitespaceFilter> whitespaceFilters;
	private final List<WinnowingTextTransformer> transformers;
	private final int k;
	private final int t;
	private final String hashingAlgorithm;

	/**
	 * 
	 * A default constructor that uses MD5 as the hashing algorithm and performs
	 * no preprocessing on source documents before fingerprinting.
	 * 
	 * @param k
	 *            The size of the document k-grams that will be fingerprinted.
	 *            Set k to the largest size that will not preclude the detection
	 *            of "interesting" duplicates.
	 * @param t
	 *            The guarantee threshold. Duplicates of length t or greater are
	 *            guaranteed to be detectable using the resulting document
	 *            fingerprints.
	 * @throws IllegalArgumentException
	 *             Thrown when the values for k or t are illegal. The value for
	 *             k must be greater than zero. The value for t canto be less
	 *             than k.
	 */
	public WinnowingFingerprinter(int k, int t) throws IllegalArgumentException
	{
		this(new ArrayList<WinnowingWhitespaceFilter>(0), new ArrayList<WinnowingTextTransformer>(0), k, t, "MD5");
	}

	/**
	 * 
	 * @param whitespaceFilters
	 *            The list of filters that will be used to remove whitespace
	 *            from documents before they are fingerprinted. The definition
	 *            of "whitespace" will vary across domains.
	 * @param transformers
	 *            The list of transformers that are used to translate source
	 *            documents to a canonical form prior to fingerprinting.
	 * @param k
	 *            The size of the document k-grams that will be fingerprinted.
	 *            Set k to the largest size that will not preclude the detection
	 *            of "interesting" duplicates.
	 * @param t
	 *            The guarantee threshold. Duplicates of length t or greater are
	 *            guaranteed to be detectable using the resulting document
	 *            fingerprints.
	 * @param hashingAlgorithm
	 *            You might use MD5 or SHA1. In any case, select a hashing
	 *            algorithm that makes accidental fingerprint collisions
	 *            extremely unlikely.
	 * @throws IllegalArgumentException
	 *             Thrown when the values for k or t are illegal. The value for
	 *             k must be greater than zero. The value for t canto be less
	 *             than k.
	 */
	public WinnowingFingerprinter(List<WinnowingWhitespaceFilter> whitespaceFilters, List<WinnowingTextTransformer> transformers, int k, int t, String hashingAlgorithm) throws IllegalArgumentException
	{
		super();
		this.whitespaceFilters = Collections.unmodifiableList(whitespaceFilters);
		this.transformers = Collections.unmodifiableList(transformers);
		this.k = k;
		this.t = t;
		this.hashingAlgorithm = hashingAlgorithm;

		if (k < 1)
		{
			throw new IllegalArgumentException("k cannot be less than one");
		}

		if (t < k)
		{
			throw new IllegalArgumentException("t cannot be less than k");
		}

	}

	/**
	 * 
	 * @param sourceDocument
	 * @return A set of BigInteger values that are the hashed k-grams selected
	 *         by the algorithm to be the source document's fingerprints.
	 * @throws NoSuchAlgorithmException
	 */
	public final Set<BigInteger> fingerprint(String sourceDocument) throws NoSuchAlgorithmException
	{

		// Copy the string reference, don't want to lose the original input
		String toFingerPrint = sourceDocument;

		// perform transformations
		for (WinnowingTextTransformer transformer : transformers)
		{
			toFingerPrint = transformer.doTransformation(toFingerPrint);
		}

		// remove whitespace
		for (WinnowingWhitespaceFilter filter : whitespaceFilters)
		{
			toFingerPrint = filter.doFilter(toFingerPrint);
		}

		// handle the exceptional case in which the filtered/transformed
		// originalInput is shorter than the value set for k
		if (toFingerPrint.length() < k)
		{
			return Collections.emptySet();
		}

		// generate k-grams
		List<String> orderedKGrams = generateKGrams(toFingerPrint);

		// hash kGrams
		List<BigInteger> hashedOrderedKGrams = new ArrayList<BigInteger>(orderedKGrams.size());
		for (String kg : orderedKGrams)
		{
			hashedOrderedKGrams.add(hashKGram(kg));
		}

		// calculate window size
		int w = t - k + 1;

		// prepare to select fingerprint hashes
		Set<BigInteger> fingerprintHashes = new HashSet<BigInteger>();

		// handle the exceptional case in which the window size is greater than
		// the number of hashedOrderedKGrams
		if (w > hashedOrderedKGrams.size())
		{

			fingerprintHashes.add(selectMin(hashedOrderedKGrams));

		}
		else
		{

			// select fingerprint hashes by sliding a window of size w across
			// hashedOrderedKGrams.
			for (int windowStartOffset = 0; windowStartOffset + w <= hashedOrderedKGrams.size(); windowStartOffset++)
			{
				fingerprintHashes.add(selectMin(hashedOrderedKGrams.subList(windowStartOffset, windowStartOffset + w)));
			}

		}

		return fingerprintHashes;

	}

	final BigInteger selectMin(List<BigInteger> integers)
	{
		if (integers.isEmpty())
		{
			throw new RuntimeException("unexpected");
		}

		BigInteger min = integers.get(0);

		for (BigInteger candidate : integers)
		{
			if (candidate.compareTo(min) < 0)
			{
				min = candidate;
			}
		}

		return min;

	}

	final List<String> generateKGrams(String toFingerPrint)
	{
		List<String> kGrams = new ArrayList<String>(toFingerPrint.length());

		// generate kGrams by sliding a window of size k across the characters
		// of toFingerPrint
		for (int windowStartOffset = 0; windowStartOffset + k <= toFingerPrint.length(); windowStartOffset++)
		{
			kGrams.add(toFingerPrint.substring(windowStartOffset, windowStartOffset + k));
		}

		return kGrams;
	}

	final BigInteger hashKGram(String kGram) throws NoSuchAlgorithmException
	{
		MessageDigest md = MessageDigest.getInstance(hashingAlgorithm);
		byte[] digestedKGram = md.digest(kGram.getBytes());
		return new BigInteger(1, digestedKGram);
	}

	/**
	 * 
	 * @return The size of the document k-grams that will be fingerprinted. Set
	 *         k to the largest size that will not preclude the detection of
	 *         "interesting" duplicates.
	 */
	public int getK()
	{
		return k;
	}

	/**
	 * 
	 * @return The guarantee threshold. Duplicates of length t or greater are
	 *         guaranteed to be detectable using the resulting document
	 *         fingerprints.
	 */
	public int getT()
	{
		return t;
	}

	/**
	 * 
	 * @return The hashing algorithm used to generate fingerprints for a source
	 *         document's k-grams.
	 */
	public String getHashingAlgorithm()
	{
		return hashingAlgorithm;
	}

	/**
	 * 
	 * @return The list of filters that will be used to remove whitespace from
	 *         documents before they are fingerprinted. The definition of
	 *         "whitespace" will vary across domains.
	 */
	public List<WinnowingWhitespaceFilter> getWhitespaceFilters()
	{
		return whitespaceFilters;
	}

	/**
	 * 
	 * @return The list of transformers that are used to translate source
	 *         documents to a canonical form prior to fingerprinting.
	 */
	public List<WinnowingTextTransformer> getTransformers()
	{
		return transformers;
	}

}
